Library

library(tidyverse)
library(here)      
library(readxl)    
library(janitor)   
library(stringr)   
library(tidyr)     
library(dplyr)
library(knitr)
library(ggplot2)
library(factoextra)
library(gt)
library(naniar)
library(ggcorrplot)
library(corrplot)
library(GGally)
library(randomForest)
library(pdp)
library(e1071)  
library(dplyr) 
library(caret)
library(plotly)
library(stargazer)
library(broom)
library(knitr)
library(kableExtra)
library(grid)
library(gridExtra)
library(patchwork)
library(glmnet)

Data Import

raw_bp <- read.csv(here("ZIPallFiles","AHSnpa11bp.csv"), header=TRUE)
raw_bb <- read.csv(here("ZIPallFiles","AHSnpa11bb.csv"), header=TRUE)
raw_bsp <- read.csv(here("ZIPallFiles","inp13bsp.csv"), header=TRUE)
raw_bcn <- read.csv(here("ZIPallFiles","inp13bcn.csv"), header=TRUE)
raw_bhh <- read.csv(here("ZIPallFiles","inp13bhh.csv"), header=TRUE)

Rename and Created

bp_bb <- inner_join(raw_bp, raw_bb, by = "ABSPID") %>%
  rename(HypertensionStatus = HYPBC,
         BodyMass = SABDYMS,
         SocialStatus = SF2SA1QN,
         PersonID = ABSPID,
         Age = AGEC,
         Gender = SEX,
         BMI = BMISC,
         Systolic = SYSTOL,
         Diastolic = DIASTOL,
         SmokeStatus = SMKSTAT,
         AlcoholPercentage_Day1 = ALCPER1,
         AlcoholPercentage_Day2 = ALCPER2,
         Potassium_Day1 = POTAST1,
         Potassium_Day2 = POTAST2,
         Sodium_Day1 = SODIUMT1,
         Sodium_Day2 = SODIUMT2,
         FiberEnergy_Day1 = FIBRPER1,
         FiberEnergy_Day2 = FIBRPER2,
         CarbonhydrateEnergy_Day1 = CHOPER1,
         CarbonhydrateEnergy_Day2 = CHOPER2,
         EnergyBMR_Day1 = EIBMR1,
         EnergyBMR_Day2 = EIBMR2) %>%
   mutate(Identity = factor(0))

bsp_bhh<- left_join(raw_bsp, raw_bhh,by = c("ABSHID")) %>%
  rename(BodyMass = SABDYMS,
         SocialStatus = SF2SA1DB,
         HouseID = ABSHID,
         Age = AGEEC,
         Gender = SEX,
         BMI = BMISC,
         Systolic = SYSTOL,
         Diastolic = DIASTOL,
         SmokeStatus = SMKSTAT,
         AlcoholPercentage_Day1 = ALCPER1,
         AlcoholPercentage_Day2 = ALCPER2,
         Potassium_Day1 = POTAST1,
         Potassium_Day2 = POTAST2,
         Sodium_Day1 = SODIUMT1,
         Sodium_Day2 = SODIUMT2,
         FiberEnergy_Day1 = FIBRPER1,
         FiberEnergy_Day2 = FIBRPER2,
         CarbonhydrateEnergy_Day1 = CHOPER1,
         CarbonhydrateEnergy_Day2 = CHOPER2,
         EnergyBMR_Day1 = EIBMR1,
         EnergyBMR_Day2 = EIBMR2)%>%
   mutate(Identity = factor(1))

bcn <- raw_bcn %>%
  rename(HouseID = ABSHID,
         MedicalCondition = ICD10ME)

First Filter

bp_bb <- bp_bb %>%
  mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
         EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
         EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
  filter(
    Age >= 18, 
    HypertensionStatus == 5,
    BodyMass != 4,
    (is.na(EnergyBMR) | EnergyBMR >= 0.9))

bsp_bhh <- bsp_bhh %>% 
  mutate(EnergyBMR_Day1 = na_if(EnergyBMR_Day1, 997) %>% na_if(998),
         EnergyBMR_Day2 = na_if(EnergyBMR_Day2, 997) %>% na_if(998),
         EnergyBMR = (EnergyBMR_Day1 + EnergyBMR_Day2)/2) %>%
  filter(
    Age >= 18, 
    BodyMass != 4,
    !HouseID %in% bcn$HouseID[bcn$MedicalCondition %in% c(7, 20)],
    (is.na(EnergyBMR) | EnergyBMR >= 0.9))

Merge

raw_data <- bind_rows(bp_bb,bsp_bhh)

Variable Selection

selected_data <- raw_data %>%
select(
    PersonID, HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, Identity,
    AlcoholPercentage_Day1, AlcoholPercentage_Day2,
    Potassium_Day1, Potassium_Day2, 
    Sodium_Day1, Sodium_Day2,
    FiberEnergy_Day1, FiberEnergy_Day2, 
    CarbonhydrateEnergy_Day1, CarbonhydrateEnergy_Day2
  )

Type Check 1

data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
  gt() %>%
  tab_header(
    title = "Variable Names and Their Types"
  ) %>%
 tab_caption(caption = md("Table 1: Variable types table"))
Table 1: Variable types table
Variable Names and Their Types
Variable Type
PersonID character
HouseID character
SocialStatus integer
Age integer
Gender integer
BMI numeric
Systolic integer
Diastolic integer
SmokeStatus integer
Identity factor
AlcoholPercentage_Day1 numeric
AlcoholPercentage_Day2 numeric
Potassium_Day1 numeric
Potassium_Day2 numeric
Sodium_Day1 numeric
Sodium_Day2 numeric
FiberEnergy_Day1 numeric
FiberEnergy_Day2 numeric
CarbonhydrateEnergy_Day1 numeric
CarbonhydrateEnergy_Day2 numeric

Na values + New Variables + Type Conversion

selected_data <- selected_data %>%
  mutate(
    across(c(Gender, SocialStatus, SmokeStatus), as.factor),
    BMI = na_if(BMI, 98) %>% na_if(99),
    Systolic = na_if(Systolic, 0) %>% na_if(998) %>% na_if(999),
    Diastolic = na_if(Diastolic, 0) %>% na_if(998) %>% na_if(999),
    Potassium = (Potassium_Day1 + Potassium_Day2) / 2, 
    Sodium = (Sodium_Day1 + Sodium_Day2) / 2,
    FiberEnergy = (FiberEnergy_Day1 + FiberEnergy_Day2)/2,
    CarbonhydrateEnergy = (CarbonhydrateEnergy_Day1 + CarbonhydrateEnergy_Day2) / 2,
    AlcoholPercentage = (AlcoholPercentage_Day1 + AlcoholPercentage_Day2) /2,
    PotassiumSodiumRatio = Sodium / Potassium
    ) %>% 
  select(PersonID,HouseID, SocialStatus, Age, Gender, BMI, Systolic, Diastolic, SmokeStatus, AlcoholPercentage, PotassiumSodiumRatio, FiberEnergy, CarbonhydrateEnergy, Identity)

Type Check 2

data.frame(Variable = names(selected_data), Type = sapply(selected_data, class)) %>%
  gt() %>%
  tab_header(
    title = "Variable Names and Their Types"
  ) %>%
 tab_caption(caption = md("Table 2: Variable types table after correction"))
Table 2: Variable types table after correction
Variable Names and Their Types
Variable Type
PersonID character
HouseID character
SocialStatus factor
Age integer
Gender factor
BMI numeric
Systolic integer
Diastolic integer
SmokeStatus factor
AlcoholPercentage numeric
PotassiumSodiumRatio numeric
FiberEnergy numeric
CarbonhydrateEnergy numeric
Identity factor

Duplicate Values

selected_data %>% 
  select(-PersonID) %>% 
  distinct() %>% 
  dim() %>% 
  tibble::tibble(Dimension = c("Rows", "Columns"), Count = .) %>% 
  gt() %>%
  tab_header(title = "Dimensions of Unique Tibble") %>%
  tab_caption(caption = md("Table 3: Dimensions of Unique Data Table"))
Table 3: Dimensions of Unique Data Table
Dimensions of Unique Tibble
Dimension Count
Rows 8449
Columns 13

Outliers and New Variables

normal_ranges <- list(Systolic = c(90, 180),
                      Diastolic = c(60, 120),
                      BMI = c(16, 47.5), 
                      PotassiumSodiumRatio = c(0, 4), 
                      AlcoholPercentage = c(0, 85), 
                      CarbonhydrateEnergy = c(0, 100), 
                      FiberEnergy = c(0, 100))

selected_data <- selected_data %>%
  filter(
    (is.na(Systolic) | (Systolic >= normal_ranges$Systolic[1] & Systolic <= normal_ranges$Systolic[2])),
    (is.na(Diastolic) | (Diastolic >= normal_ranges$Diastolic[1] & Diastolic <= normal_ranges$Diastolic[2])),
    (is.na(BMI) | (BMI >= normal_ranges$BMI[1] & BMI <= normal_ranges$BMI[2])),
    (is.na(AlcoholPercentage) | (AlcoholPercentage >= normal_ranges$AlcoholPercentage[1] & AlcoholPercentage <= normal_ranges$AlcoholPercentage[2])),
    (is.na(CarbonhydrateEnergy) | (CarbonhydrateEnergy >= normal_ranges$CarbonhydrateEnergy[1] & CarbonhydrateEnergy <= normal_ranges$CarbonhydrateEnergy[2])),
    (is.na(FiberEnergy) | (FiberEnergy >= normal_ranges$FiberEnergy[1] & FiberEnergy <= normal_ranges$FiberEnergy[2])),
    (is.na(PotassiumSodiumRatio) | (PotassiumSodiumRatio >= normal_ranges$PotassiumSodiumRatio[1] & PotassiumSodiumRatio <= normal_ranges$PotassiumSodiumRatio[2])))

Categrocial Distribution

custom_colors <- c("#540d6e", "#ee4266","#ffd23f","#3bceac","#0ead69")

categorical_vars <- selected_data %>%
  select(where(is.factor)) %>%
  names()

long_data <- selected_data %>%
  pivot_longer(cols = all_of(categorical_vars), names_to = "Variable", values_to = "Value")

plot <- ggplot(long_data, aes(x = Value, fill = Variable)) +
  geom_bar(aes(y = after_stat(prop), group = Variable), position = "dodge", show.legend = FALSE) +
  facet_wrap(~ Variable, scales = "free", nrow = 2) +
  scale_fill_manual(values = custom_colors) +
  labs(title = "Overall Distribution of Categorical Variables",
       y = "Proportion",
       caption = "Figure 1: Proportions of Categorical Variables") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

print(plot)

Merge Variable

selected_data <- selected_data %>% 
  mutate(SmokeStatus = recode_factor(SmokeStatus, "2" = "1", "3" = "1","4" = "2","5" = "3"))

Missing Values

selected_data %>% 
  select(-PersonID, -HouseID) %>% 
  miss_var_summary() %>% 
  gt() %>%
  tab_header(title = "Missing Value") %>%
  tab_caption(caption = md("Table 4: Missing Values in Selected Data")) %>%
  cols_label(variable = "Variable Name", n_miss = "Missing Count", pct_miss = "Percentage Missing") %>%
  fmt_number(columns = everything(), decimals = 2)
Table 4: Missing Values in Selected Data
Missing Value
Variable Name Missing Count Percentage Missing
BMI 1,242.00 15.7
Systolic 1,222.00 15.4
Diastolic 1,222.00 15.4
SocialStatus 0.00 0
Age 0.00 0
Gender 0.00 0
SmokeStatus 0.00 0
AlcoholPercentage 0.00 0
PotassiumSodiumRatio 0.00 0
FiberEnergy 0.00 0
CarbonhydrateEnergy 0.00 0
Identity 0.00 0
corr_matrix <- selected_data%>%
  select(where(is.numeric)) %>%
  mutate(across(everything(), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .))) %>%
  cor(use = "complete.obs")

ggcorrplot(corr_matrix, lab = TRUE,
           title = "Correlation Heatmap for ALL",
           colors = c("blue", "white", "red"),
           tl.cex = 9, lab_size = 4) +
  theme(legend.position = "none") +
  labs(caption = "Figure 2: Correlation Heatmap for ALL Variables")

Regression Imputation

selected_data$BMI[is.na(selected_data$BMI)] <- lm(BMI ~ Gender + SocialStatus + Age + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% 
  predict(newdata = selected_data[is.na(selected_data$BMI), ])

selected_data$Systolic[is.na(selected_data$Systolic)] <- lm(Systolic ~ Gender + SocialStatus + Age + BMI + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>% 
  predict(newdata = selected_data[is.na(selected_data$Systolic), ])

selected_data$Diastolic[is.na(selected_data$Diastolic)] <- lm(Diastolic ~ Gender + SocialStatus + Age + BMI + Systolic + PotassiumSodiumRatio + FiberEnergy + CarbonhydrateEnergy + AlcoholPercentage + SmokeStatus + Identity, data = selected_data, na.action = na.exclude) %>%
  predict(newdata = selected_data[is.na(selected_data$Diastolic), ])

vis_miss(selected_data %>% 
           select(-PersonID, -HouseID)) +
  ggtitle("Missing Data Heatmap for Selected Variables") +
  labs(caption = "Figure 3: Visualization of Missing Data for Selected Variables")

# Hypertension variable

selected_data <- selected_data %>%  
  mutate(Hypertension = as.factor(case_when(Systolic >= 120 | Diastolic >= 80 ~ 1,
                                            TRUE ~ 0)))

Normalization

numeric_cols <- sapply(selected_data, is.numeric)  
normalized_data <- selected_data
normalized_data[numeric_cols] <- scale(selected_data[numeric_cols])

Multicollinearity

selected_data %>%
  select(where(is.numeric), -Systolic, -Diastolic) %>%
  ggscatmat() +
  ggtitle("Scatter Matrix of Numeric Variables") +
  labs(caption = "Figure 4: Scatter Matrix of Numeric Variables")

Logistic Regression

lr <- glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, 
           data = normalized_data, 
           family = binomial())
summary(lr)
## 
## Call:
## glm(formula = Hypertension ~ SocialStatus + Gender + Age + BMI + 
##     SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + 
##     Identity + PotassiumSodiumRatio, family = binomial(), data = normalized_data)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)           0.95072    0.07782  12.217  < 2e-16 ***
## SocialStatus2        -0.07198    0.07867  -0.915  0.36024    
## SocialStatus3        -0.16746    0.08156  -2.053  0.04006 *  
## SocialStatus4        -0.16674    0.08641  -1.930  0.05364 .  
## SocialStatus5        -0.26239    0.08307  -3.159  0.00159 ** 
## Gender2              -0.83919    0.05232 -16.039  < 2e-16 ***
## Age                   0.77836    0.03010  25.855  < 2e-16 ***
## BMI                   0.50127    0.02795  17.935  < 2e-16 ***
## SmokeStatus2         -0.23045    0.07219  -3.192  0.00141 ** 
## SmokeStatus3          0.01697    0.06605   0.257  0.79722    
## AlcoholPercentage     0.13033    0.02743   4.752 2.02e-06 ***
## FiberEnergy          -0.01895    0.03585  -0.529  0.59703    
## CarbonhydrateEnergy  -0.07369    0.03297  -2.235  0.02539 *  
## Identity1             0.04106    0.07537   0.545  0.58591    
## PotassiumSodiumRatio -0.01901    0.02685  -0.708  0.47903    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10865.6  on 7931  degrees of freedom
## Residual deviance:  9196.2  on 7917  degrees of freedom
## AIC: 9226.2
## 
## Number of Fisher Scoring iterations: 3

Random Forest

set.seed(3888)

rf <- randomForest(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data, ntree = 500)

Support Vector Machine

final_data <- cbind(as.data.frame(predict(dummyVars(~ . - Hypertension - PersonID - HouseID - Systolic - Diastolic, data = normalized_data), 
                                          newdata = normalized_data)), 
                    Hypertension = normalized_data$Hypertension)

set.seed(3888)

svm <- svm(Hypertension ~ ., 
           data = final_data,  
           kernel = "radial", cost = 1, gamma = 0.1)  

Model comparison (5-fold cross validation)

set.seed(3888)
k_folds <- 5
n_repeats <- 3

logistic_accuracy <- numeric(k_folds * n_repeats)
logistic_f1 <- numeric(k_folds * n_repeats)
logistic_sensitivity <- numeric(k_folds * n_repeats)
logistic_specificity <- numeric(k_folds * n_repeats)
logistic_precision <- numeric(k_folds * n_repeats)

rf_accuracy <- numeric(k_folds * n_repeats)
rf_f1 <- numeric(k_folds * n_repeats)
rf_sensitivity <- numeric(k_folds * n_repeats)
rf_specificity <- numeric(k_folds * n_repeats)
rf_precision <- numeric(k_folds * n_repeats)

svm_accuracy <- numeric(k_folds * n_repeats)
svm_f1 <- numeric(k_folds * n_repeats)
svm_sensitivity <- numeric(k_folds * n_repeats)
svm_specificity <- numeric(k_folds * n_repeats)
svm_precision <- numeric(k_folds * n_repeats)

ensure_factor_levels <- function(predictions, reference) {
  factor(predictions, levels = levels(reference))
}

for (repeat_idx in 1:n_repeats) {
  folds <- createFolds(normalized_data$Hypertension, k = k_folds, list = TRUE, returnTrain = FALSE)
  
  for (i in 1:k_folds) {
    fold_idx <- (repeat_idx - 1) * k_folds + i

    test_indices <- folds[[i]]
    test_data <- normalized_data[test_indices, ]
    train_data <- normalized_data[-test_indices, ]

    log_model <- glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, 
                     data = train_data, family = binomial())
    log_predictions <- predict(log_model, test_data, type = "response")
    log_class <- ifelse(log_predictions > 0.5, 1, 0)
    log_class <- ensure_factor_levels(log_class, test_data$Hypertension)

    log_conf_matrix <- confusionMatrix(log_class, test_data$Hypertension, positive = "1")
    logistic_accuracy[fold_idx] <- log_conf_matrix$overall['Accuracy']
    logistic_f1[fold_idx] <- log_conf_matrix$byClass['F1']
    logistic_sensitivity[fold_idx] <- log_conf_matrix$byClass['Sensitivity']
    logistic_specificity[fold_idx] <- log_conf_matrix$byClass['Specificity']
    logistic_precision[fold_idx] <- log_conf_matrix$byClass['Precision']

    rf_model <- randomForest(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, 
                             data = train_data, ntree = 500)
    
    rf_predictions <- predict(rf_model, test_data)
    rf_predictions <- ensure_factor_levels(rf_predictions, test_data$Hypertension)

    rf_conf_matrix <- confusionMatrix(rf_predictions, test_data$Hypertension, positive = "1")
    rf_accuracy[fold_idx] <- rf_conf_matrix$overall['Accuracy']
    rf_f1[fold_idx] <- rf_conf_matrix$byClass['F1']
    rf_sensitivity[fold_idx] <- rf_conf_matrix$byClass['Sensitivity']
    rf_specificity[fold_idx] <- rf_conf_matrix$byClass['Specificity']
    rf_precision[fold_idx] <- rf_conf_matrix$byClass['Precision']

    dummy_model <- dummyVars(~ . - Hypertension - PersonID - HouseID - Systolic - Diastolic, data = train_data)
    encoded_train_data <- as.data.frame(predict(dummy_model, newdata = train_data))
    encoded_test_data <- as.data.frame(predict(dummy_model, newdata = test_data))
    encoded_train_data$Hypertension <- train_data$Hypertension
    encoded_test_data$Hypertension <- test_data$Hypertension

    svm_model <- svm(Hypertension ~ ., data = encoded_train_data, kernel = "radial", cost = 1, gamma = 0.1)
    svm_predictions <- predict(svm_model, encoded_test_data)
    svm_predictions <- ensure_factor_levels(svm_predictions, encoded_test_data$Hypertension)

    svm_conf_matrix <- confusionMatrix(svm_predictions, encoded_test_data$Hypertension, positive = "1")
    svm_accuracy[fold_idx] <- svm_conf_matrix$overall['Accuracy']
    svm_f1[fold_idx] <- svm_conf_matrix$byClass['F1']
    svm_sensitivity[fold_idx] <- svm_conf_matrix$byClass['Sensitivity']
    svm_specificity[fold_idx] <- svm_conf_matrix$byClass['Specificity']
    svm_precision[fold_idx] <- svm_conf_matrix$byClass['Precision']
  }
}

ave_logistic_accuracy <- mean(logistic_accuracy)
ave_logistic_f1 <- mean(logistic_f1)
ave_logistic_sensitivity <- mean(logistic_sensitivity)
ave_logistic_specificity <- mean(logistic_specificity)
ave_logistic_precision <- mean(logistic_precision)

ave_rf_accuracy <- mean(rf_accuracy)
ave_rf_f1 <- mean(rf_f1)
ave_rf_sensitivity <- mean(rf_sensitivity)
ave_rf_specificity <- mean(rf_specificity)
ave_rf_precision <- mean(rf_precision)

ave_svm_accuracy <- mean(svm_accuracy)
ave_svm_f1 <- mean(svm_f1)
ave_svm_sensitivity <- mean(svm_sensitivity)
ave_svm_specificity <- mean(svm_specificity)
ave_svm_precision <- mean(svm_precision)
results_df <- data.frame(
    Model = c("Logistic Regression", "Random Forest", "SVM"),
    Average_Accuracy = c(ave_logistic_accuracy, ave_rf_accuracy, ave_svm_accuracy),
    Average_F1 = c(ave_logistic_f1, ave_rf_f1, ave_svm_f1),
    Average_Sensitivity = c(ave_logistic_sensitivity, ave_rf_sensitivity, ave_svm_sensitivity),
    Average_Specificity = c(ave_logistic_specificity, ave_rf_specificity, ave_svm_specificity),
    Average_Precision = c(ave_logistic_precision, ave_rf_precision, ave_svm_precision)
)

kable(results_df, format = "markdown", caption = "Table 5:Model Performance Metrics") %>%
    kable_styling("striped", full_width = F)
Table 5:Model Performance Metrics
Model Average_Accuracy Average_F1 Average_Sensitivity Average_Specificity Average_Precision
Logistic Regression 0.7044885 0.7481466 0.7783494 0.6089291 0.7202971
Random Forest 0.6956228 0.7396327 0.7665783 0.6038241 0.7147264
SVM 0.6991105 0.7455815 0.7818504 0.5920600 0.7127107
results_long <- results_df %>%
  pivot_longer(cols = starts_with("Average_"), 
               names_to = "Metric", 
               values_to = "Value") %>%
  mutate(Metric = sub("Average_", "", Metric)) 

colors <- c('#90F1EF', '#ffd6e0', '#ffef9f')

fig <- plot_ly(results_long, x = ~Metric, y = ~round(Value, 3), type = 'bar', color = ~Model, colors = colors,
               text = ~round(Value, 3), textposition = 'outside', textfont = list(color = 'black')) %>%
  layout(
    title = "Figure 5: Model Performance Metrics Comparison",
    yaxis = list(title = 'Score', range = c(0.5, 0.8)), 
    barmode = 'group',
    paper_bgcolor = 'rgba(245, 246, 249, 1)',
    showlegend = TRUE, # Show legend for models
    margin = list(l = 50, r = 50, b = 100, t = 100, pad = 4), 
    plot_bgcolor = 'rgba(0, 0, 0, 0)' 
  )

fig

Feature Selection

Group by Age

normalized_data_young <- selected_data %>%
  filter(Age < 50)
normalized_data_old <- selected_data %>%
  filter(Age >= 50)

numeric_cols <- sapply(normalized_data_young, is.numeric)  
normalized_data_young[numeric_cols] <- scale(normalized_data_young[numeric_cols])

numeric_cols <- sapply(normalized_data_old, is.numeric)  
normalized_data_old[numeric_cols] <- scale(normalized_data_old[numeric_cols])

Group by Varibles

Set1

lr1 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data, family = binomial())

lr2 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,data = normalized_data_young, family = binomial())

lr3 = glm(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data_old, family = binomial())
models1 <- list(lr1, lr2, lr3)
model_names1 <- c("Logistic Regression Model A(All, All)", 
                 "Logistic Regression Model B(All, Young)", 
                 "Logistic Regression Model C(All, Old)")
coef_plots1 <- list()
pvalue_plots1 <- list()

for (i in seq_along(models1)) {
  model <- models1[[i]]
  
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  p <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") + 
    coord_flip() +
    labs(title = model_names1[i]) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  coef_plots1[[i]] <- p
}

for (i in seq_along(models1)) {
  model <- models1[[i]]
  
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names1[i], y = "P-Value") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  pvalue_plots1[[i]] <- p
}

combined_coef_plot1 <- wrap_plots(coef_plots1, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot2 <- wrap_plots(pvalue_plots1, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot1 <- wrap_plots(combined_coef_plot1, combined_pvalue_plot2, nrow = 2)

combined_plot1

Set2

lr4 = glm(Hypertension ~ SocialStatus + SmokeStatus + Gender + AlcoholPercentage + Gender + BMI + FiberEnergy + CarbonhydrateEnergy + Age,data = normalized_data, family = binomial())

lr5 = glm(Hypertension ~ SocialStatus + SmokeStatus + Gender + BMI + AlcoholPercentage + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_young, family = binomial())

lr6 = glm(Hypertension ~ SocialStatus + Gender + BMI + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_old, family = binomial())
models2 <- list(lr4, lr5, lr6)
model_names2 <- c("Logistic Regression Model a(Few, All)", 
                 "Logistic Regression Model b(Few, Young)", 
                 "Logistic Regression Model c(Few, Old)")
coef_plots2 <- list()
pvalue_plots2 <- list()

for (i in seq_along(models2)) {
  model <- models2[[i]]
  
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  p <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") + 
    coord_flip() +
    labs(title = model_names2[i]) +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  coef_plots2[[i]] <- p
}

for (i in seq_along(models1)) {
  model <- models1[[i]]
  
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names2[i], y = "P-Value") +
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))
  
  pvalue_plots2[[i]] <- p
}

combined_coef_plot3 <- wrap_plots(coef_plots2, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot4 <- wrap_plots(pvalue_plots2, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot2 <- wrap_plots(combined_coef_plot3, combined_pvalue_plot4, nrow = 2)

combined_plot2

Set3

lr7 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy,data = normalized_data, family = binomial())

lr8 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_young, family = binomial())

lr9 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_old, family = binomial())
models3 <- list(lr7, lr8, lr9)
model_names3 <- c("Logistic Regression Model i(Two, All)", 
                 "Logistic Regression Model ii(Two, Young)", 
                 "Logistic Regression Model iii(Two, Old)")
pvalue_plots3 <- list()
coef_plots3 <- list()

# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models3)) {
  model <- models3[[i]]
  
  # Create a data frame of coefficients and p-values
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  # Create the p-value plot for the current model
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +  # Use p-values here
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names3[i], y = "P-Value") +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the p-value plot in the list
  pvalue_plots3[[i]] <- p
  
  # Create the coefficient plot for the current model
  coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names3[i]) +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the coefficient plot in the list
  coef_plots3[[i]] <- coef_plot
}

combined_coef_plot5 <- wrap_plots(coef_plots3, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot6 <- wrap_plots(pvalue_plots3, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot3 <- wrap_plots(combined_coef_plot5, combined_pvalue_plot6, nrow = 2)

combined_plot3

Full Plot

wrap_plots(combined_coef_plot1,
           combined_coef_plot3, 
           combined_coef_plot5, nrow = 3)

wrap_plots(combined_pvalue_plot2,
           combined_pvalue_plot4, 
           combined_pvalue_plot6, nrow = 3)

Lasso

x <- model.matrix(Hypertension ~ SocialStatus + Gender + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, 
                  data = normalized_data)[, -1]

y <- normalized_data$Hypertension

lasso_model <- cv.glmnet(x, y, family = "binomial", alpha = 1)

best_lambda <- lasso_model$lambda.min
print(best_lambda)
## [1] 0.002258056
lasso_coefficients <- coef(lasso_model, s = best_lambda)
print(lasso_coefficients)
## 15 x 1 sparse Matrix of class "dgCMatrix"
##                                s1
## (Intercept)           0.869479944
## SocialStatus2         .          
## SocialStatus3        -0.081628276
## SocialStatus4        -0.076872408
## SocialStatus5        -0.174128820
## Gender2              -0.808526376
## Age                   0.759028895
## BMI                   0.487845075
## SmokeStatus2         -0.209441021
## SmokeStatus3          .          
## AlcoholPercentage     0.116352372
## FiberEnergy          -0.011067435
## CarbonhydrateEnergy  -0.070421423
## Identity1             0.048984935
## PotassiumSodiumRatio -0.004548014

Group By Gender

normalized_data_male <- normalized_data %>%
  filter(Gender == 1)
normalized_data_female <- normalized_data %>%
  filter(Gender == 2)

Set4

lr10 = glm(Hypertension ~ SocialStatus + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio,data = normalized_data_male, family = binomial())

lr11 = glm(Hypertension ~ SocialStatus + Age + BMI + SmokeStatus + AlcoholPercentage + FiberEnergy + CarbonhydrateEnergy + Identity + PotassiumSodiumRatio, data = normalized_data_female, family = binomial())
models4 <- list(lr1, lr10, lr11)
model_names4 <- c("Logistic Regression Model D(All, All)", 
                 "Logistic Regression Model E(All, Male)", 
                 "Logistic Regression Model F(All, Female")
pvalue_plots4 <- list()
coef_plots4 <- list()

# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models4)) {
  model <- models4[[i]]
  
  # Create a data frame of coefficients and p-values
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  # Create the p-value plot for the current model
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +  # Use p-values here
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names4[i], y = "P-Value") +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the p-value plot in the list
  pvalue_plots4[[i]] <- p
  
  # Create the coefficient plot for the current model
  coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names4[i]) +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the coefficient plot in the list
  coef_plots4[[i]] <- coef_plot
}

combined_coef_plot7 <- wrap_plots(coef_plots4, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot8 <- wrap_plots(pvalue_plots4, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot4 <- wrap_plots(combined_coef_plot7, combined_pvalue_plot8, nrow = 2)

combined_plot4

Set5

lr12 = glm(Hypertension ~ SocialStatus + SmokeStatus + BMI + AlcoholPercentage + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_male, family = binomial())

lr13 = glm(Hypertension ~ SocialStatus + BMI + Age + FiberEnergy + CarbonhydrateEnergy, data = normalized_data_female, family = binomial())
models5 <- list(lr4, lr12, lr13)
model_names5 <- c("Logistic Regression Model d(Few, All)", 
                 "Logistic Regression Model e(Few, Male)", 
                 "Logistic Regression Model f(Few, Female)")
pvalue_plots5 <- list()
coef_plots5 <- list()

# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models5)) {
  model <- models5[[i]]
  
  # Create a data frame of coefficients and p-values
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  # Create the p-value plot for the current model
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +  # Use p-values here
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names5[i], y = "P-Value") +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the p-value plot in the list
  pvalue_plots5[[i]] <- p
  
  # Create the coefficient plot for the current model
  coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names5[i]) +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the coefficient plot in the list
  coef_plots5[[i]] <- coef_plot
}

combined_coef_plot9 <- wrap_plots(coef_plots5, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot10 <- wrap_plots(pvalue_plots5, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot5 <- wrap_plots(combined_coef_plot9, combined_pvalue_plot10, nrow = 2)

combined_plot5

### Set6

lr14 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_male, family = binomial())

lr15 = glm(Hypertension ~ FiberEnergy + CarbonhydrateEnergy, data = normalized_data_female, family = binomial())
models6 <- list(lr7, lr14, lr15)
model_names6 <- c("Logistic Regression Model iv(Two, All)", 
                 "Logistic Regression Model v(Two, Male)", 
                 "Logistic Regression Model vi(Two, Female)")
pvalue_plots6 <- list()
coef_plots6 <- list()

# Loop through each model to generate the p-value and coefficient plots
for (i in seq_along(models6)) {
  model <- models6[[i]]
  
  # Create a data frame of coefficients and p-values
  coef_df <- as.data.frame(coef(summary(model)))
  names(coef_df)[which(names(coef_df) == "Std. Error")] <- "Std_Error"
  coef_df$Variable <- rownames(coef_df)
  
  # Create the p-value plot for the current model
  p <- ggplot(coef_df, aes(x = Variable, y = `Pr(>|z|)`)) +  # Use p-values here
    geom_point() +
    geom_hline(yintercept = 0.05, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names6[i], y = "P-Value") +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the p-value plot in the list
  pvalue_plots6[[i]] <- p
  
  # Create the coefficient plot for the current model
  coef_plot <- ggplot(coef_df, aes(x = Variable, y = Estimate)) +
    geom_point() +
    geom_errorbar(aes(ymin = Estimate - Std_Error, ymax = Estimate + Std_Error)) +
    geom_hline(yintercept = 0, linetype = "dashed", color = "red") +
    coord_flip() +
    labs(title = model_names6[i]) +  # Title for each model
    theme_minimal() +
    theme(plot.title = element_text(hjust = 0.5))  # Center the title
  
  # Store the coefficient plot in the list
  coef_plots6[[i]] <- coef_plot
}

combined_coef_plot11 <- wrap_plots(coef_plots6, ncol = 3) + 
  plot_annotation(title = "Logistic Regression Coefficient Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_pvalue_plot12 <- wrap_plots(pvalue_plots6, ncol = 3) + 
  plot_annotation(title = "Logistic Regression P-Value Plots", 
                  theme = theme(plot.title = element_text(size = 16, face = "bold")))

combined_plot6 <- wrap_plots(combined_coef_plot11, combined_pvalue_plot12, nrow = 2)

combined_plot6

Full Plot

wrap_plots(combined_coef_plot7,
           combined_coef_plot9, 
           combined_coef_plot11, nrow = 3)

wrap_plots(combined_pvalue_plot8,
           combined_pvalue_plot10, 
           combined_pvalue_plot12, nrow = 3)